% The Computer Society usually requires 10pt for submissions.
%
\documentclass[10pt,journal,compsoc]{IEEEtran}
%
%\hyphenation{op-tical net-works semi-conduc-tor}

% ---------------------------------------------------------------------
% Packages
% ---------------------------------------------------------------------
\usepackage{amsmath}     % also provides \DeclareMathOperator (used below)
\usepackage{color}
\usepackage{graphicx}
\usepackage{balance}
\usepackage{setspace}
\usepackage{spverbatim}
\usepackage{algorithm}   % floating wrapper for algorithms
\usepackage{algorithmic} % pseudo-code body inside the algorithm float

% ---------------------------------------------------------------------
% Theorem-like environments.
% Each environment is declared without a shared counter, so definitions,
% theorems, proofs and lemmas are numbered independently of each other.
% Note that "myproof" is a numbered environment (Proof 1, Proof 2, ...).
% ---------------------------------------------------------------------
\newtheorem{definition}{Definition}
\newtheorem{theorem}{Theorem}
\newtheorem{myproof}{Proof}
\newtheorem{lemma}{Lemma}

% ---------------------------------------------------------------------
% Math operators
% ---------------------------------------------------------------------
% \argmax: upright operator name; the starred form places sub/superscripts
% as limits (below/above) in display style, e.g. \argmax_{x \in X} f(x).
% Declared with amsmath's \DeclareMathOperator* instead of the legacy
% \operatornamewithlimits; the printed operator name is unchanged.
\DeclareMathOperator*{\argmax}{argmax}

\begin{document}
%
% paper title
% can use linebreaks \\ within to get better formatting as desired
\title{SERIMI: Class-based Matching for Instance Matching Across Heterogeneous Datasets}

\author{Samur Araujo, Duc Thanh Tran, Arjen P. de Vries and Daniel Schwabe%
%
% note need leading \protect in front of \\ to get a newline within \thanks as
% \\ is fragile and will error, could use \hfil\break instead.
%
}

 
\IEEEcompsoctitleabstractindextext{%
\begin{abstract}  
%We study the problem of detecting different instance representations that refer to the same real world entity, also called \emph{instance matching}. 
Based on a detailed analysis, we observed that state-of-the-art instance matching approaches do not perform well when used for matching instances \emph{across heterogeneous datasets}. This is because they are built upon \emph{direct matching}, which involves a direct comparison of 
%instances in the 
a source dataset with 
%instances in the 
a target dataset. This 
%matching paradigm 
is not suitable when the overlap between the datasets is too small. 
%, which is often the case with heterogeneous data. 
%to provide sufficient cues for a direct comparison. 
To address this problem, we propose a new paradigm called \emph{class-based matching}. 
%, which we use in combination with direct matching. 
Given a class of instances from the source dataset, called the \emph{class of interest}, and a set of candidate matches retrieved from the target, 
%(via direct matching), 
class-based matching helps to refine the candidates by filtering out those that do not belong to the class of interest. For this refinement, only data in the target is used, i.e., no direct comparison between source and target is involved. 
%Besides the main idea, we also discuss optimizations to \emph{compactly represent the class of interest} for greater efficiency and a method to \emph{automatically select the threshold} for filtering matches more effectively. 
Based on extensive experiments using 
%ly evaluate our approach
%, called SERIMI, 
%using two 
public benchmarks, 
we show that our approach greatly improves the results of state-of-the-art systems, especially on hard matching tasks.  
% and several other state-of-the-art systems not covered by the benchmarks. The results suggest that SERIMI uses valuable 
%These \emph{extensive experiments} show that SERIMI yields superior results. The class-based matching achieved competitive results when compared to the direct matching; and most importantly, it was complementary to it when the direct matching presented a low performance. In average, SERIMI outperformed all baselines.  \todo{i added more about results. not number because they are not so impressive.}
\end{abstract}  
 

% Note that keywords are not normally used for peer review papers.
\begin{keywords}
Data integration, Class-based matching, Direct matching, Instance matching, Semantic Web.
\end{keywords}

}


% make the title area
\maketitle


 
%\IEEEdisplaynotcompsoctitleabstractindextext
% \IEEEdisplaynotcompsoctitleabstractindextext has no effect when using
% compsoc under a non-conference mode.


 
%\IEEEpeerreviewmaketitle



\input{sec-introduction}
\input{sec-overview}
\input{sec-approach}
\input{sec-reduction}
\input{sec-threshold}
\input{sec-evaluation1} 
\input{sec-evaluation2}
\input{sec-tables}
\input{sec-related}

\section{Conclusion}
 
In this work, we propose an unsupervised instance matching approach that combines direct matching with a novel class-based matching technique to infer sameAs relations over heterogeneous data. 
%This method focuses on determining similarity between instances, specially when there is not enough overlapping among source and target instances. Also, we propose an efficient class-based matching algorithm and a method that uses a statistic outlier detection strategy to eliminate false positive matches from a set of candidates matches. 
We evaluated our method using two public benchmarks: OAEI 2010 and 2011. The results show that our method achieves good results that are competitive with those of several representative systems focused on instance matching over heterogeneous data.
 

\bibliographystyle{IEEEtran}
\bibliography{journal}

\clearpage 
\pagebreak
\input{sec-appendix}
 
%\begin{IEEEbiographynophoto}{Samur Araujo} is a Phd student at TUDelft. He holds a bachelor and master degree in Computer Science. His research is on data integration over Linked Data.  
%\end{IEEEbiographynophoto}
 

%\begin{IEEEbiographynophoto}{Arjen P. de Vries} is a tenured researcher at CWI leading the Information Access research group, and a full professor (0.2 fte) in the area of multimedia data management at the Technical University of Delft. De Vries studies the intersection of information retrieval and databases. He has held general and programme chair positions at SIGIR 2007, CIKM 2011, ECIR 2012 and ECIR 2014. De Vries is a member of the TREC PC (who coordinated enterprise search and entity retrieval tracks), and a steering committee member of INEX (the Initiative for the Evaluation of XML Retrieval). In November 2009, De Vries co-founded Spinque, a CWI spin-off that provides integrated access to any type of data, customized for information specialist or end user, to produce effective and transparent search results.
%\end{IEEEbiographynophoto}
\end{document}



